This is analysis for proteomics dataset generated using mouse colon tumor samples from Villin-CreER;Apc(fl/fl);KRasWT and Villin-CreER;Apc(fl/fl);KRasG12D mice. Tumor was induced through enema of 4-OHT in experimental and control mice. Emily Poulin harvested the tissue samples and the proteomics were performed by Joao Paulo. Part of the analysis code was adapted from original script from Shikha Sheth.
suppressMessages(
c(library(gridExtra),
library(ensembldb),
library(EnsDb.Mmusculus.v79),
library(grid),
library(ggplot2),
library(lattice),
library(reshape),
library(mixOmics),
library(gplots),
library(RColorBrewer),
library(readr),
library(dplyr),
library(VennDiagram),
library(clusterProfiler),
library(DOSE),
library(org.Mm.eg.db),
library(pathview),
library(AnnotationDbi),
library(tidyr),
library(qdapRegex),
library(gtools),
library(ggfortify))
)
## [1] "gridExtra" "stats" "graphics"
## [4] "grDevices" "utils" "datasets"
## [7] "methods" "base" "ensembldb"
## [10] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [13] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [16] "IRanges" "S4Vectors" "stats4"
## [19] "BiocGenerics" "parallel" "gridExtra"
## [22] "stats" "graphics" "grDevices"
## [25] "utils" "datasets" "methods"
## [28] "base" "EnsDb.Mmusculus.v79" "ensembldb"
## [31] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [34] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [37] "IRanges" "S4Vectors" "stats4"
## [40] "BiocGenerics" "parallel" "gridExtra"
## [43] "stats" "graphics" "grDevices"
## [46] "utils" "datasets" "methods"
## [49] "base" "grid" "EnsDb.Mmusculus.v79"
## [52] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [55] "AnnotationDbi" "Biobase" "GenomicRanges"
## [58] "GenomeInfoDb" "IRanges" "S4Vectors"
## [61] "stats4" "BiocGenerics" "parallel"
## [64] "gridExtra" "stats" "graphics"
## [67] "grDevices" "utils" "datasets"
## [70] "methods" "base" "ggplot2"
## [73] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [76] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [79] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [82] "IRanges" "S4Vectors" "stats4"
## [85] "BiocGenerics" "parallel" "gridExtra"
## [88] "stats" "graphics" "grDevices"
## [91] "utils" "datasets" "methods"
## [94] "base" "lattice" "ggplot2"
## [97] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [100] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [103] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [106] "IRanges" "S4Vectors" "stats4"
## [109] "BiocGenerics" "parallel" "gridExtra"
## [112] "stats" "graphics" "grDevices"
## [115] "utils" "datasets" "methods"
## [118] "base" "reshape" "lattice"
## [121] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [124] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [127] "AnnotationDbi" "Biobase" "GenomicRanges"
## [130] "GenomeInfoDb" "IRanges" "S4Vectors"
## [133] "stats4" "BiocGenerics" "parallel"
## [136] "gridExtra" "stats" "graphics"
## [139] "grDevices" "utils" "datasets"
## [142] "methods" "base" "mixOmics"
## [145] "MASS" "reshape" "lattice"
## [148] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [151] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [154] "AnnotationDbi" "Biobase" "GenomicRanges"
## [157] "GenomeInfoDb" "IRanges" "S4Vectors"
## [160] "stats4" "BiocGenerics" "parallel"
## [163] "gridExtra" "stats" "graphics"
## [166] "grDevices" "utils" "datasets"
## [169] "methods" "base" "gplots"
## [172] "mixOmics" "MASS" "reshape"
## [175] "lattice" "ggplot2" "grid"
## [178] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [181] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [184] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [187] "S4Vectors" "stats4" "BiocGenerics"
## [190] "parallel" "gridExtra" "stats"
## [193] "graphics" "grDevices" "utils"
## [196] "datasets" "methods" "base"
## [199] "RColorBrewer" "gplots" "mixOmics"
## [202] "MASS" "reshape" "lattice"
## [205] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [208] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [211] "AnnotationDbi" "Biobase" "GenomicRanges"
## [214] "GenomeInfoDb" "IRanges" "S4Vectors"
## [217] "stats4" "BiocGenerics" "parallel"
## [220] "gridExtra" "stats" "graphics"
## [223] "grDevices" "utils" "datasets"
## [226] "methods" "base" "readr"
## [229] "RColorBrewer" "gplots" "mixOmics"
## [232] "MASS" "reshape" "lattice"
## [235] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [238] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [241] "AnnotationDbi" "Biobase" "GenomicRanges"
## [244] "GenomeInfoDb" "IRanges" "S4Vectors"
## [247] "stats4" "BiocGenerics" "parallel"
## [250] "gridExtra" "stats" "graphics"
## [253] "grDevices" "utils" "datasets"
## [256] "methods" "base" "dplyr"
## [259] "readr" "RColorBrewer" "gplots"
## [262] "mixOmics" "MASS" "reshape"
## [265] "lattice" "ggplot2" "grid"
## [268] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [271] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [274] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [277] "S4Vectors" "stats4" "BiocGenerics"
## [280] "parallel" "gridExtra" "stats"
## [283] "graphics" "grDevices" "utils"
## [286] "datasets" "methods" "base"
## [289] "VennDiagram" "futile.logger" "dplyr"
## [292] "readr" "RColorBrewer" "gplots"
## [295] "mixOmics" "MASS" "reshape"
## [298] "lattice" "ggplot2" "grid"
## [301] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [304] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [307] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [310] "S4Vectors" "stats4" "BiocGenerics"
## [313] "parallel" "gridExtra" "stats"
## [316] "graphics" "grDevices" "utils"
## [319] "datasets" "methods" "base"
## [322] "clusterProfiler" "VennDiagram" "futile.logger"
## [325] "dplyr" "readr" "RColorBrewer"
## [328] "gplots" "mixOmics" "MASS"
## [331] "reshape" "lattice" "ggplot2"
## [334] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [337] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [340] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [343] "IRanges" "S4Vectors" "stats4"
## [346] "BiocGenerics" "parallel" "gridExtra"
## [349] "stats" "graphics" "grDevices"
## [352] "utils" "datasets" "methods"
## [355] "base" "DOSE" "clusterProfiler"
## [358] "VennDiagram" "futile.logger" "dplyr"
## [361] "readr" "RColorBrewer" "gplots"
## [364] "mixOmics" "MASS" "reshape"
## [367] "lattice" "ggplot2" "grid"
## [370] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [373] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [376] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [379] "S4Vectors" "stats4" "BiocGenerics"
## [382] "parallel" "gridExtra" "stats"
## [385] "graphics" "grDevices" "utils"
## [388] "datasets" "methods" "base"
## [391] "org.Mm.eg.db" "DOSE" "clusterProfiler"
## [394] "VennDiagram" "futile.logger" "dplyr"
## [397] "readr" "RColorBrewer" "gplots"
## [400] "mixOmics" "MASS" "reshape"
## [403] "lattice" "ggplot2" "grid"
## [406] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [409] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [412] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [415] "S4Vectors" "stats4" "BiocGenerics"
## [418] "parallel" "gridExtra" "stats"
## [421] "graphics" "grDevices" "utils"
## [424] "datasets" "methods" "base"
## [427] "pathview" "org.Mm.eg.db" "DOSE"
## [430] "clusterProfiler" "VennDiagram" "futile.logger"
## [433] "dplyr" "readr" "RColorBrewer"
## [436] "gplots" "mixOmics" "MASS"
## [439] "reshape" "lattice" "ggplot2"
## [442] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [445] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [448] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [451] "IRanges" "S4Vectors" "stats4"
## [454] "BiocGenerics" "parallel" "gridExtra"
## [457] "stats" "graphics" "grDevices"
## [460] "utils" "datasets" "methods"
## [463] "base" "pathview" "org.Mm.eg.db"
## [466] "DOSE" "clusterProfiler" "VennDiagram"
## [469] "futile.logger" "dplyr" "readr"
## [472] "RColorBrewer" "gplots" "mixOmics"
## [475] "MASS" "reshape" "lattice"
## [478] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [481] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [484] "AnnotationDbi" "Biobase" "GenomicRanges"
## [487] "GenomeInfoDb" "IRanges" "S4Vectors"
## [490] "stats4" "BiocGenerics" "parallel"
## [493] "gridExtra" "stats" "graphics"
## [496] "grDevices" "utils" "datasets"
## [499] "methods" "base" "tidyr"
## [502] "pathview" "org.Mm.eg.db" "DOSE"
## [505] "clusterProfiler" "VennDiagram" "futile.logger"
## [508] "dplyr" "readr" "RColorBrewer"
## [511] "gplots" "mixOmics" "MASS"
## [514] "reshape" "lattice" "ggplot2"
## [517] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [520] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [523] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [526] "IRanges" "S4Vectors" "stats4"
## [529] "BiocGenerics" "parallel" "gridExtra"
## [532] "stats" "graphics" "grDevices"
## [535] "utils" "datasets" "methods"
## [538] "base" "qdapRegex" "tidyr"
## [541] "pathview" "org.Mm.eg.db" "DOSE"
## [544] "clusterProfiler" "VennDiagram" "futile.logger"
## [547] "dplyr" "readr" "RColorBrewer"
## [550] "gplots" "mixOmics" "MASS"
## [553] "reshape" "lattice" "ggplot2"
## [556] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [559] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [562] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [565] "IRanges" "S4Vectors" "stats4"
## [568] "BiocGenerics" "parallel" "gridExtra"
## [571] "stats" "graphics" "grDevices"
## [574] "utils" "datasets" "methods"
## [577] "base" "gtools" "qdapRegex"
## [580] "tidyr" "pathview" "org.Mm.eg.db"
## [583] "DOSE" "clusterProfiler" "VennDiagram"
## [586] "futile.logger" "dplyr" "readr"
## [589] "RColorBrewer" "gplots" "mixOmics"
## [592] "MASS" "reshape" "lattice"
## [595] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [598] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [601] "AnnotationDbi" "Biobase" "GenomicRanges"
## [604] "GenomeInfoDb" "IRanges" "S4Vectors"
## [607] "stats4" "BiocGenerics" "parallel"
## [610] "gridExtra" "stats" "graphics"
## [613] "grDevices" "utils" "datasets"
## [616] "methods" "base" "ggfortify"
## [619] "gtools" "qdapRegex" "tidyr"
## [622] "pathview" "org.Mm.eg.db" "DOSE"
## [625] "clusterProfiler" "VennDiagram" "futile.logger"
## [628] "dplyr" "readr" "RColorBrewer"
## [631] "gplots" "mixOmics" "MASS"
## [634] "reshape" "lattice" "ggplot2"
## [637] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [640] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [643] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [646] "IRanges" "S4Vectors" "stats4"
## [649] "BiocGenerics" "parallel" "gridExtra"
## [652] "stats" "graphics" "grDevices"
## [655] "utils" "datasets" "methods"
## [658] "base"
The original dataset was loaded.
tumor_proteom <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/colon tumor-enema model/2017-03-19_Haigis_5v5_Pro.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## `Protein Id` = col_character(),
## Description = col_character(),
## control1 = col_double(),
## control2 = col_double(),
## control3 = col_double(),
## control4 = col_double(),
## control5 = col_double(),
## KRAS1 = col_double(),
## KRAS2 = col_double(),
## KRAS3 = col_double(),
## KRAS4 = col_double(),
## KRAS5 = col_double()
## )
# establish a new data frame for collecting stats
tumor_stats <- tumor_proteom[,1:2]
# Calculate the pvalue using parametric unpaired t test
p_value_list <- c()
for (i in 1:dim(tumor_proteom)[1]) {
p_value <- t.test(unlist(tumor_proteom[i,8:12]), unlist(tumor_proteom[i,3:7]), paired = FALSE)$p.value
p_value_list <- c(p_value_list, p_value)
}
tumor_stats <- cbind(tumor_stats, p_value_list)
colnames(tumor_stats)[3] <- "p_values"
# calculate the q value using Benjamini Hochberg FDR correction
q_value_list <- p.adjust(tumor_stats$p_values, method = "BH")
tumor_stats <- cbind(tumor_stats, q_value_list)
colnames(tumor_stats)[4] <- "q_values"
# calculate fold change and log fold change
foldchange_list <- c()
for (i in 1:dim(tumor_proteom)[1]) {
foldchange <- foldchange(mean(unlist(tumor_proteom[i,8:12])), mean(unlist(tumor_proteom[i,3:7])))
foldchange_list <- c(foldchange_list, foldchange)
}
logfoldchange_list <- foldchange2logratio(foldchange_list)
tumor_stats <- cbind(tumor_stats, foldchange_list, logfoldchange_list)
colnames(tumor_stats)[5:6] <- c("foldChange", "LFC")
Now we output this statistical analysis file into a csv file.
write.csv(tumor_stats, "crcMS_diff.csv", col.names = NULL)
Just to check how many siginificant proteins do we have based on p< 0.05 and q< 0.1
sig_dif_stats <- subset(tumor_stats, tumor_stats$p_values <= 0.05 & tumor_stats$q_values <= 0.1)
dim(sig_dif_stats)[1]
## [1] 2471
So there are 2471 proteins identified to have significantly different expression between G12D/WT.
Since the number of significant changes are quite small, I want to use PCA to check how the samples cluster.
dir.create("PDF_figure", showWarnings = FALSE)
df <- tumor_proteom[3:12]
df <- as.data.frame(t(df))
df <- cbind(df, c('WT','WT','WT','WT','WT','G12D','G12D','G12D','G12D','G12D'))
colnames(df)[8186] <- 'Genotype'
df.set <- as.matrix(df[,1:8185])
df.pca <- prcomp(df.set, center = TRUE, scale = TRUE)
autoplot(df.pca, data = df, colour = 'Genotype') +
geom_text(aes(label=rownames(df)), vjust = 2, hjust = -0.1) +
xlim(-0.5, 0.6) + ylim(-0.7, 0.6)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
pdf('PDF_figure/PCA.pdf',
height = 4,
width = 6)
autoplot(df.pca, data = df, colour = 'Genotype') +
geom_text(aes(label=rownames(df)), vjust = 2, hjust = -0.1) +
xlim(-0.5, 0.6) + ylim(-0.7, 0.6)
dev.off()
## quartz_off_screen
## 2
pseudoCount = log2(tumor_proteom[3:12])
# remove NA, NaN, Inf values from the dataframe
pseudoCount <- na.omit(pseudoCount)
pseudoCount <- pseudoCount[is.finite(rowSums(pseudoCount)),]
mat.dist = pseudoCount
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/colon tumor-enema model")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(10, 10 ))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('PDF_figure/Hierchical_Clustering.pdf')
cim(mat.dist, symkey = FALSE, margins = c(7, 7))
dev.off()
## quartz_off_screen
## 2
Final output is following:
For heatmap, I will z-score all quantifications across all samples for the same protein. Heatmaps for all proteins with p<0.05 and q < 0.1 are plotted
suppressMessages(library(mosaic))
sig_index <- c()
duplicate <- c()
for (i in 1:dim(sig_dif_stats)[1]) {
index <- grep(sig_dif_stats$`Protein Id`[i], tumor_proteom$`Protein Id`)
if (length(index) == 1) {
sig_index <- c(sig_index, index)
}
else {
duplicate <- c(duplicate, i)
}
}
sig_count <- tumor_proteom[sig_index,]
sig_dif <- cbind(sig_dif_stats[-duplicate,], sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,9:18] <- zscore(as.numeric(sig_dif[i,9:18]))
}
my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
heatmap_matrix <- as.matrix(sig_dif[,9:18])
png('G12D vs WT colon tumor proteomics.png',
width = 600,
height = 1400,
res = 200,
pointsize = 8)
par(cex.main=1.1)
heatmap.2(heatmap_matrix,
main = "DE proteins in colon\ntumor(enema model)\n(G12D/WT) p < 0.05 q < 0.1",
density.info = "none",
key = TRUE,
lwid = c(3,7),
lhei = c(1,7),
col=my_palette,
margins = c(8,2),
symbreaks = TRUE,
trace = "none",
dendrogram = "row",
labRow = FALSE,
ylab = "Proteins",
cexCol = 1.5,
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/Heatmap.pdf',
width = 6,
height = 14)
heatmap.2(heatmap_matrix,
main = "DE proteins in colon\ntumor(enema model)\n(G12D/WT) p < 0.05 q < 0.1",
density.info = "none",
key = TRUE,
lwid = c(3,7),
lhei = c(1,7),
col=my_palette,
margins = c(8,2),
symbreaks = TRUE,
trace = "none",
dendrogram = "row",
labRow = FALSE,
ylab = "Proteins",
cexCol = 1.5,
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
Final output is
# Scatter plot
tumor_stats$KrasG12D_mean <- rowMeans(log2(tumor_proteom[,8:12]))
tumor_stats$KrasWT_mean <- rowMeans(log2(tumor_proteom[,3:7]))
ggplot(tumor_stats, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") +
geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Scatter Plot")
pdf('PDF_figure/Scatter_Plot.pdf',
width = 5,
height = 4)
ggplot(tumor_stats, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") +
geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Scatter Plot")
dev.off()
## quartz_off_screen
## 2
# MA plot
tumor_stats$'baseMean' <- rowMeans(tumor_proteom[,3:12])
tumor_stats$'log2baseMean' <- log(tumor_stats$`baseMean`,2)
red_subset <- subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC > 0)
blue_subset <- subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC < 0)
ggplot(tumor_stats, aes(x = tumor_stats$`log2baseMean`, y = tumor_stats$`LFC`)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = red_subset, aes(x=red_subset$`log2baseMean`, y=red_subset$`LFC`), alpha = 0.5, size = 1, color = "red") +
geom_point(data = blue_subset, aes(x=blue_subset$`log2baseMean`, y=blue_subset$`LFC`), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT MA Plot")
## Warning: Use of `tumor_stats$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `tumor_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
pdf('PDF_figure/MA_Plot.pdf',
width = 5,
height = 4)
ggplot(tumor_stats, aes(x = tumor_stats$`log2baseMean`, y = tumor_stats$`LFC`)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = red_subset, aes(x=red_subset$`log2baseMean`, y=red_subset$`LFC`), alpha = 0.5, size = 1, color = "red") +
geom_point(data = blue_subset, aes(x=blue_subset$`log2baseMean`, y=blue_subset$`LFC`), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT MA Plot")
## Warning: Use of `tumor_stats$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `tumor_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
ggplot(tumor_stats, aes(x = tumor_stats$`LFC`, y = -log(tumor_stats$`p_values`,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = red_subset, aes(x=red_subset$`LFC`, y=-log(red_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "red") +
geom_point(data = blue_subset, aes(x=blue_subset$`LFC`, y=-log(blue_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Volcano Plot")
## Warning: Use of `tumor_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `tumor_stats$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$p_values` is discouraged. Use `p_values` instead.
pdf('PDF_figure/Volcano_Plot.pdf',
width = 5,
height = 4)
ggplot(tumor_stats, aes(x = tumor_stats$`LFC`, y = -log(tumor_stats$`p_values`,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = red_subset, aes(x=red_subset$`LFC`, y=-log(red_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "red") +
geom_point(data = blue_subset, aes(x=blue_subset$`LFC`, y=-log(blue_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Volcano Plot")
## Warning: Use of `tumor_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `tumor_stats$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$p_values` is discouraged. Use `p_values` instead.
dev.off()
## quartz_off_screen
## 2
For the Go analysis, I am using all proteins that have a p<0.05 and q < 0.1.
target_gene <- as.character(sig_dif_stats$`Protein Id`)
detected_gene <- as.character(tumor_proteom$`Protein Id`)
# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "UNIPROT",
OrgDb = org.Mm.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)
dim(cluster_summary_BP)[1]
## [1] 284
write.csv(cluster_summary_BP, "GO/GO analysis_BP.csv")
# Run GO enrichment analysis for molecular function
ego_MF <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "UNIPROT",
OrgDb = org.Mm.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)
dim(cluster_summary_MF)[1]
## [1] 16
write.csv(cluster_summary_MF, "GO/GO analysis_MF.csv")
# Run GO enrichment analysis for cellular component
ego_CC <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "UNIPROT",
OrgDb = org.Mm.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)
dim(cluster_summary_CC)[1]
## [1] 40
write.csv(cluster_summary_CC, "GO/GO analysis_CC.csv")
png('GO/GO dotplot_BP.png',
width = 1400,
height = 1600,
res = 100,
pointsize = 8)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO dotplot_BP.pdf',
width = 14,
height = 16)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Molecular Function
png('GO/GO dotplot_MF.png',
width = 1000,
height = 800,
res = 100,
pointsize = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO dotplot_MF.pdf',
width = 10,
height = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Cellular Component
png('GO/GO dotplot_CC.png',
width = 1400,
height = 1600,
res = 100,
pointsize = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO dotplot_CC.pdf',
width = 14,
height = 16)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen
## 2
Final output is following:
png('GO/GO enrichment_BP.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO enrichment_BP.pdf',
width = 16,
height = 16)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Molecular Function
png('GO/GO enrichment_MF.png',
width = 1000,
height = 1000,
res = 100,
pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO enrichment_MF.pdf',
width = 10,
height = 10)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Cellular Component
png('GO/GO enrichment_CC.png',
width = 1000,
height = 1000,
res = 100,
pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO enrichment_CC.pdf',
width = 10,
height = 10)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
Final output is following:
The category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color).
# annotate the tumor stats with gene symbol
annotations_edb <- AnnotationDbi::select(org.Mm.eg.db,
keys = tumor_stats$`Protein Id`,
columns = c("ENSEMBL", "SYMBOL"),
keytype = "UNIPROT")
## 'select()' returned 1:many mapping between keys and columns
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_edb$UNIPROT) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_edb$ENSEMBL) == FALSE)
non_duplicates_idx <- which(is.na(annotations_edb$ENSEMBL) == FALSE)
# Return only the non-duplicated genes using indices
annotations_edb <- annotations_edb[non_duplicates_idx, ]
# Check number of NAs returned
is.na(annotations_edb$ENSEMBL) %>%
which() %>%
length()
## [1] 0
# Join the Ensembl annotation to proteomics quantification
tumor_stats <- inner_join(tumor_stats, annotations_edb, by=c("Protein Id"="UNIPROT"))
OE_foldchanges <- tumor_stats$LFC
names(OE_foldchanges) <- tumor_stats$SYMBOL
png('GO/GO cnetplot_BP.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_BP,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO cnetplot_BP.pdf',
width = 16,
height = 16)
cnetplot(ego_BP,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Molecular Function
png('GO/GO cnetplot_MF.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_MF,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO cnetplot_MF.pdf',
width = 16,
height = 16)
cnetplot(ego_MF,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Cellular Component
png('GO/GO cnetplot_CC.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_CC,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO cnetplot_CC.pdf',
width = 16,
height = 16)
cnetplot(ego_CC,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
Final output is following:
I need to generate a quantification file that also has ensembl ID with them.
tumor_gsea <- inner_join(tumor_proteom, annotations_edb, by=c("Protein Id"="UNIPROT"))
write.csv(tumor_gsea, "GSEA/Raw Quantification.csv")